#message=FALSE means some output messages aren't printed in the knitted HTML document

knitr::opts_chunk$set(echo = FALSE, warning=FALSE) 
#include=FALSE, echo=FALSE and warning=FALSE ensure the code and any error messages aren't shown in the output HTML file

#message=FALSE means no output messages are printed on the knitted HMTL file
#loading any necessary packages
library(dplyr)
library(tidyverse) 
library(knitr)
library(readr)
library(ggplot2)
library(bbmle)
library(mizer) 
library(Hmisc)
library(lme4)
library(RColorBrewer)
library(viridis)
library(hexbin)
load("stomach_dataset.Rdata")
#loading the dataset to be used in analysis (the dataset is already named stom_df)

df <- stom_df %>% transmute(haul_id, ices_rectangle, year, pred_species, 
                            pred_weight_g, pred_length_cm, prey_weight_g, 
                            prey_type = prey_funcgrp, indiv_prey_weight = prey_ind_weight_g, 
                            prey_count, n_stomachs, no._prey_per_stmch = prey_count/n_stomachs, ppmr) 
#creating a new data set called 'df' which only contains certain selected columns

df <- df[df$indiv_prey_weight != 0, ]
df <- df[df$pred_weight_g != 0, ]
df <- df[df$indiv_prey_weight != Inf, ]
#Removes any points where the prey weight = Inf or 0, or the predator weight = 0
renamed_df = df %>% 
  mutate(pred_species = replace(pred_species, 
                                pred_species == "Micromesistius poutassou", "Blue Whiting")) %>%
#  mutate(pred_species = replace(pred_species, pred_species == "Capros apers", "Boarfish")) %>%
  mutate(pred_species = replace(pred_species, pred_species == "Gadus morhua", "Cod")) %>% 
  mutate(pred_species = replace(pred_species, pred_species == "Limanda limanda", "Common Dab")) %>%
  mutate(pred_species = replace(pred_species, 
                                pred_species == "Merluccius merluccius", "European Hake")) %>%  
  mutate(pred_species = replace(pred_species, pred_species == "Melanogrammus aeglefinus", "Haddock")) %>%  
  mutate(pred_species = replace(pred_species, pred_species == "Clupea harengus", "Herring")) %>% 
  mutate(pred_species = replace(pred_species, pred_species == "Trachurus trachurus", "Horse Mackerel")) %>%  
  mutate(pred_species = replace(pred_species, pred_species == "Scomber scombrus", "Mackerel")) %>%
  mutate(pred_species = replace(pred_species, pred_species == "Lepidorhombus whiffiagonis", "Megrim")) %>%
  mutate(pred_species = replace(pred_species, pred_species == "Lophius piscatorius", "Monkfish")) %>% 
  mutate(pred_species = replace(pred_species, pred_species == "Trisopterus esmarkii", "Norway Pout")) %>%  
  mutate(pred_species = replace(pred_species, pred_species == "Pleuronectes platessa", "Plaice")) %>%
  mutate(pred_species = replace(pred_species, pred_species == "Trisopterus minutus", "Poor Cod")) %>%  
  mutate(pred_species = replace(pred_species, pred_species == "Solea solea", "Sole")) %>% 
  mutate(pred_species = replace(pred_species, pred_species == "Sprattus sprattus", "Sprat")) %>%
  mutate(pred_species = replace(pred_species, pred_species == "Merlangius merlangus", "Whiting"))
#Renaming the predator species from their latin names (e.g. Capros apers) to their more common names (e.g. Boarfish)
#Creates a new data frame (called 'renamed_df') with these replaced names

species_list <- c("Blue Whiting", "Cod", "Common Dab", "European Hake", "Haddock", "Herring",
                     "Horse Mackerel", "Mackerel", "Megrim", "Monkfish", "Norway Pout",  "Plaice",
                     "Poor Cod", "Sole", "Sprat", "Whiting")
#Creates an array called 'species_list', which is list of the predator species we are focusing on in this project

renamed_df <- renamed_df[renamed_df$pred_species %in% species_list, ]
#Removes any observations of predator species not in the species_list, i.e. they are irrelevant data points for this project

renamed_df$lppmr <- log(renamed_df$ppmr)
renamed_df$lprey_weight <- log(renamed_df$indiv_prey_weight)
renamed_df$lpred_weight <- log(renamed_df$pred_weight_g)
#Adds columns which take the log value of ppmr, individual prey weight and predator weight to the main data set

1 Introduction

This data set is formed from recordings taken by multiple ships between the years 1886-2016.

Fish were taken from some location, and their individual stomach contents recorded. Predators of the same species and (roughly) the same weight/size were recorded as a single data point (the number of predators for each data set is called ‘n_stomachs’). Prey of (roughly) the same size found in these stomachs were recorded in this same data point, and the total number of prey in some data point is called ‘prey_count’. For each individual data point,

\[ \text{no. prey per stomach} = \frac{\text{no. of stomachs sampled}}{\text{total no. of prey}} \] This is an approximation of the number of prey per stomach for some specific predator, and is called ’ no._prey_per_stmch’.

Using similar logic,‘prey_weight_g’ is the total weight of prey in the multiple stomachs sampled to make up one datapoint. ‘indiv_prey_weight’ is then the weight of one single prey individual found in the stomach

There are some other column names which are useful to note:

2 Distribution of prey type eaten for each predator

ggplot(renamed_df, aes(x=prey_type, fill=prey_type)) +
  geom_bar(stat="count", width=0.7) +
  facet_wrap(~renamed_df$pred_species, scale="free_y") +
  theme(axis.text.x=element_blank(), axis.ticks.x=element_blank())  +
  scale_fill_brewer(palette = "Dark2") +
  labs(title = "Distribution of prey types for each predator", x="Prey type", y="Count")

#geom_bar() creates a bar chart from the specified data
#facet_wrap() means an individual bar chart is created for each individual predator species
#'theme()' removes any words on the x-axis so the graphs are easier to understand
#scale_fill_brewer changes the colour palette of the plot so that the colours are easy to distinguish between

These are individual bar charts showing the distribution of the type of prey each predator eats.

The prey types are:

We can see that some predators eat a range of prey types (e.g. Whiting), while others prefer to eat a single prey type in abundance (e.g. European Hake and Monkfish mostly consume fish as their prey).

3 Most common prey weights

ggplot(data = renamed_df, aes(lprey_weight, no._prey_per_stmch)) + 
      labs(title = "log(prey weight) v. number of prey per predator stomach", 
       x="log(prey weight)", y="No. of prey per predator stomach") + 
      geom_point(size=0.5)

#geom_point() adds individual points which show individual recorded points
#size=0.5 defines the size of each point on this graph

This graph is looking at the distribution of the weight of prey recorded, i.e. looking at what is the most common prey weight over all prey species. It is a graph over all the data points, so includes recordings from all the ships involved.

There are some potentially interesting results, so results from different ships were plotted on individual curves to further look into these results.

renamed_df$'haul_id_short' <- gsub("\\-.*", "", renamed_df$'haul_id')
renamed_df$'haul_id_short' <- gsub("_", "", renamed_df$'haul_id_short')
#the haul_id values are renamed to be shortened versions of the ship names (e.g. CLYDE) rather than the complete id (e.g. CLYDE-1935-6)

interesting_haul <- filter(renamed_df, 
                           haul_id_short=='CLYDE'|haul_id_short=='END04'|haul_id_short=='LUC'|
                             haul_id_short=='HIDDINK'|haul_id_short=='EXCmacDATSTO815'|
                             haul_id_short=="Excmacdatsto815error")
#interesting_haul is a new data frame which only contains values recorded by ships which gave "interesting" looking datapoints

haul_list_interesting <- unique(interesting_haul$haul_id_short)
#creates an array containing every individual (renamed) ship name that we are interested in

ggplot (data = interesting_haul, aes(x=lprey_weight, y=no._prey_per_stmch)) + 
  labs(title = "log(prey weight) v. number of prey per stomach - separated by ship names",
       x="log(prey weight)", y="No. prey per stomach") + 
  geom_point(size=0.02, colour="red") +
  theme(strip.text = element_text(size = 5)) + 
  facet_wrap(~interesting_haul$haul_id_short, scale="free_y")

These six graphs show a range of unusual looking points:

  1. \(y \propto e^{-x}\) relation for END04 (i.e. no. per stomach is proportional to 1/prey weight)
  2. lots of observations for single prey weights for LUC and CLYDE
  3. lots of the same no. of prey per stomach observations for HIDDINK.
  4. the EXCmacDATSTO815 and Excmacdatsto815error seem to represent exactly the same data

Though we will not remove or alter the data set to account for these potentially erroneous data points in this project, for further analysis it may be sensible to remove the data points these six ships recorded so that final analysis is as reliable as possible.

4 Prey and predator weight relation

#message=FALSE means some output messages aren't printed in the knitted HTML document

ggplot(data = renamed_df, aes(lprey_weight, lpred_weight)) + 
  labs(title = "log(Predator mass) v. log(prey mass) plot", 
       x="log(Prey mass)", y="log(Predator mass)") + 
  geom_hex(bins = 50) +
  scale_fill_gradient2(low = "black", mid="orange", high = "red", midpoint=600) +
  stat_smooth(method='lm', se=FALSE, colour="blue")

# stat_smooth adds a line of best fit plotted through the points
# method='lm' ensures it is a straight line, i.e. a linear model
# se=FAlSE creates only a single line, with no error of margin included

#geom_hex creates "bins" in the data where the density is calculated over
#bins=50 means that there are 50 "bins" in the horizontal direction and 50 in the vertical
#scale_fill_gradient2 creates a density colour scale, meaning that areas with a high density of points are red and areas with a low density of points are darker in colour

slope <- coef(lm(renamed_df$lpred_weight~renamed_df$lprey_weight))
paste("slope of the log(pred) v. log(prey) line of best fit:", slope[2])
## [1] "slope of the log(pred) v. log(prey) line of best fit: 0.489336522809739"
# coef(lm()) creates an array containing the coefficients of the linear model of the relationship between two axes
# The first item of the array gives out the y-intercept of the line, and the second item of the array is the gradient

We are attempting to find a link between the predator mass and the prey mass. Log() of each axis is used to see the proportionality of the axes, where the slope of the added line should = 1. This is because:

\[ \log (\text{pred weight}) = m \times \text{log(prey weight)} + c \]

Where m is the gradient of the slope and c is the y-intercept (thinking of this graph as a linear model).

Taking the exponential of both sides,

\[ \text{pred weight} = \exp({{m}\times{\log(\text{prey weight})}}) + \log(D) \] where log(D) = c. 

Finally,

\[ \text{pred weight} = \text{prey weight}^m \times D \]

Hence, we want m=1 to show that pred weight proportional to prey weight (as expected).

For this graph, the slope is not equal to one. However, this graph is plotted over the entire data set and includes values from all of the available predator species.

Instead, separate the plots by predator species to see if there is some proportionality between the predator and prey weight for each specific species.

4.1 Predator v. prey weight plot, separated by predator species

ggplot(data = renamed_df, aes(lprey_weight, lpred_weight)) + 
  labs(title = "log(pred. mass) v. log(prey mass) separated by predator species", 
       x="log(prey mass)", y="log(pred. mass)") +
  geom_point(size=0.2) + 
  facet_wrap(~pred_species, scale="free_y") + 
  theme(strip.text = element_text(size = 10)) +
  stat_smooth (method='lm', se=FALSE, colour="red")

4.2 Gradients of the plots for each predator species

i <- 1
species_grad = c()
#setting i=1 for the while loop
#and creating an empty vector called 'species_grad' to hold data

while(i<=length(species_list)){
  species_df <- renamed_df %>% filter(pred_species == fixed(species_list[i]))
  grad <- coef(lm((species_df$lpred_weight)~(species_df$lprey_weight)))
  species_grad[i] <- grad[2]
  i=i+1
}

#'while' means that this function repeats through the length of 'species_list' until all predator species are accounted for
#i=i+1 increases the value of i each time through this loop to motivate this repeat
#this creates a data frame (species_df) containing each indivdual predator species, then calculates the gradient of the log(pred weight) v. log(prey weight) graph for this individual predator species, then inserts this gradient value to the ith value of an array named 'species_grad'

pvp_grads <- data.frame(pred_species=c(species_list),
                        gradient=c(species_grad))

kable(pvp_grads)
pred_species gradient
Blue Whiting -0.0258049
Cod 0.8343249
Common Dab 0.1217269
European Hake 0.7230261
Haddock 0.2604050
Herring 0.2220153
Horse Mackerel 0.2290883
Mackerel 0.0156493
Megrim 0.3020678
Monkfish 0.7009350
Norway Pout 0.4455201
Plaice 0.4548590
Poor Cod 0.1394437
Sole 0.1141953
Sprat 0.0381477
Whiting 0.6664210
#creates a new data frame name 'pvp_grads', with one column named 'pred_species' and the other named 'gradient'
#each row contains the name of some individual predator species and its associated gradient
#kable prints the information in the pvp_grads

Here, we are splitting up the graphs over each individual predator species to look and see if there is a proportionality for each specific species.

We are wanting gradient=1 for proportionality. This is only true (to 1 significant figure) for the species Cod, European Hake, Monkfish and Whiting. Although this is a very limited number of the predator species, the assumption that the predator and prey masses are proportional is a reasonable one, so we will continue wiht this assumption.

5 Predator weight against ppmr

5.1 Plotting predator weight against ppmr, separated by predator species

ggplot(data=renamed_df, aes(lpred_weight, lppmr)) + 
  geom_point(size=0.001) +
  labs(title = "log(pred mass) v. log(ppmr) plot", x="log(Pred mass)", y="log(PPMR)") + 
  stat_smooth (method='lm', se=FALSE) + 
  facet_wrap(~pred_species, scale="free_y")

Graphing log(pred mass) v. log(ppmr) for individual predator species to see if the predator mass related to the ppmr.

\[ \log(\text{pred mass}) = m \times \log (\text{prey mass}) + c \]

where m is the gradient (assuming a linear model between the variables) and c is the y-intercept.

We want them to not be proportional (i.e. slope, m= 0) to prove that log(predator mass) is not related to the log(ppmr) (and hence the ppmr is independent of the predator mass for a certain species).

5.2 Gradients of the plots

species_grad_ppmr=c()
j <- 1
#creates an empty vector called 'species_grad_ppmr'
#setting j=1 for the while loop

while(j<=length(species_list)){
  species_df <- renamed_df %>% filter(pred_species == fixed(species_list[j]))
  grad <- coef(lm((species_df$lppmr)~(species_df$lpred_weight)))
  species_grad_ppmr[j] <- grad[2]
  j=j+1
}

ppmrvp_grads <- data.frame(pred_species=c(species_list),
                        gradient=c(species_grad_ppmr))

kable(ppmrvp_grads)
pred_species gradient
Blue Whiting 1.2032364
Cod 0.2709018
Common Dab 0.2435919
European Hake 0.1617011
Haddock 0.2581471
Herring 0.7232217
Horse Mackerel 0.4273000
Mackerel 0.7621392
Megrim -0.2055083
Monkfish 0.1811160
Norway Pout 0.0973909
Plaice 0.3061335
Poor Cod -0.0895529
Sole 0.6335299
Sprat 0.4688564
Whiting 0.2891959

For the assumption to be upheld, we need gradient=0. This is (approximately) satisfied for the species Cod, Common Dab, Euorpean Hake, Haddock, Megrim, Monkfish, Norway Pout and Poor Cod. Although none of these are exactly equal to 0, they are sufficiently close to continue with the assumption that the ppmr is independent of the predator mass for the named predator species.

5.3 Different weightings

species_df <- renamed_df %>% filter(pred_species == fixed("Poor Cod"))
#creating a data frame only containing observations where the predator species is poor cod.

ggplot(data=species_df, aes(lpred_weight, lppmr)) + 
  geom_point(size=0.5) +
  labs(title = "log(pred mass) v. log(ppmr) plot: Poor Cod", x="log(Pred mass)", y="log(PPMR)") + 
  facet_wrap(~pred_species, scale="free_y") + 
  stat_smooth(aes(weight=no._prey_per_stmch, colour='by no. of prey in stomach'), method='lm', se=FALSE) +
  stat_smooth(aes(colour='no weighting'), method='lm', se=FALSE) +
  stat_smooth(aes(weight=prey_weight_g, colour='by prey bio'), method='lm', se=FALSE) +
  stat_summary(fun.data=mean_se, geom="linerange") +
  scale_colour_manual(name="Weightings",
                     values=c('by no. of prey in stomach'='blue', 
                              'no weighting'='red', 'by prey bio'='green'))

# se=FALSE doesn't add an area of error around the LOBF 
# fun.data=mean_se calculates the mean and standard error for each point
# linerange draws a point range between an upper and lower limit for the line, and the mean for the point (using the values calcuated in 'fun.data=mean_se')

paste("slope of the log(ppmr) v. log(pred_weight) line of best fit for poor cod:")
## [1] "slope of the log(ppmr) v. log(pred_weight) line of best fit for poor cod:"
#weighted by number of prey per stomach
perstmch_weighting <- 
  coef(lm(species_df$lppmr~species_df$lpred_weight, weight=species_df$no._prey_per_stmch))
paste("i) Weighted by no prey in the stomach:", perstmch_weighting[2])
## [1] "i) Weighted by no prey in the stomach: -1.64313946193691"
#no weighting
no_weighting <- coef(lm(species_df$lppmr~species_df$lpred_weight))
paste("ii) No weighting in the calculation:", no_weighting[2])
## [1] "ii) No weighting in the calculation: -0.0895529388001952"
#weighted by prey biomass
biomass_weighting <- 
  coef(lm(species_df$lppmr~species_df$lpred_weight, weight=species_df$prey_weight_g))
paste("iii) Weighted by prey biomass:", biomass_weighting[2])
## [1] "iii) Weighted by prey biomass: 0.0116668819369047"

Looking at just the predator species ‘poor cod’. There are three LOBF (line of best fit), weighted by different variables:

  1. blue: by number of prey per stomach
  2. red: no weighting in the LOBF
  3. green: by prey biomass

When weighted by prey biomass, the gradient is equal to zero for 2 s.f.. Hence, the approximation (of slope=0) is true when looking at a prey biomass weighting.

6 Average PPMR for individual species

Here, we are looking for the most common log(PPMR) for each individual species. This will be done by plotting log(PPMR) against the density of points, and weighting observations on different variables. By assuming these are normally distributed relations, there should be a ‘peak’ point of log(ppmr) which is where the mean/most common log(ppmr) value lies. We assume this will be different for different species of predator, i.e. each predator has a preferred ppmr value and hence each predator type has a preferred relative size of prey.

6.1 Weighted by prey biomass:

ggplot(data = renamed_df, aes(x=lppmr)) + 
  labs(title = "log(PPMR) v. biomass density of prey", x="log(PPMR)", y="Biomass density of prey") +
  facet_wrap(~renamed_df$pred_species, scale="free_y") + 
  theme(strip.text = element_text(size = 5)) +
  geom_density(aes(weight = prey_weight_g), colour="red")

#using facet_wrap for the variable (pred_species) means the data is separated into individual plots for each predator species
#geom_density means area under the curve = 1 (i.e. the graph is normalised)
#weight=prey_weight_g means the points are 'weighted' by the column weight (biomass) of each individual prey

These plots are weighted by the prey biomass. This means that we are looking at the distribution of prey biomass across values of log(PPMR), so points with a larger biomass are prioritised/weighted more than those with a smaller biomass.

6.2 Weighted by number of prey:

ggplot(data = renamed_df, aes(x=lppmr)) + 
  labs(title = "log(PPMR) v. number density of prey", x="log(PPMR)", y="Number density of prey") +
  facet_wrap(~renamed_df$pred_species, scale="free_y") + 
  theme(strip.text = element_text(size = 5)) +
  geom_density(aes(weight = no._prey_per_stmch), colour="red")

#weight=no._prey_per_stmch means the points are 'weighted' by the number of prey per stomach

These plots are weighted by the number pf prey per stomach. This means that data points with a larger number of prey per stomach are prioritised/weighted more than those with a smaller number of prey per stomach.

By looking at the two differently weighted graphs, it is clear to see that the plots are ‘shifted’ by some amount (e.g. the Blue Whiting when weighted by prey biomass has a mean log(ppmr) of roughly 4.8, and when weighted by number of prey this mean becomes roughly 7.5).

7 Specific PPMR calculations by different weightings for Herring species

Here, we take only observations relating to the predator type Herring. This allows us to do more specific analysis about the shifted mean and begin to explain some of the maths behind this shifting.

herringDF <- renamed_df %>% 
    filter(pred_species == fixed("Herring"))
#creates a separate data set (called 'herringDF') only containing observations for the predator species 'Herring'

7.1 Weighted by prey biomass

bio_herringmean = weighted.mean(herringDF$lppmr, w = herringDF$prey_weight_g, na.rm = TRUE)
bio_herringSD = sqrt(wtd.var(herringDF$lppmr, w = herringDF$prey_weight_g, na.rm = TRUE))

#weighted.mean gives the arithmetic mean of log(ppmr), where datapoints are weighted by the prey weight of observations
#similarly, wtd.var is the variance of log(ppmr), where the datapoints are also weighted by the prey weight
#standard deviation is defined as the square root of the variance
#na.rm=TRUE means any rows with missing values (values that equal 'na') aren't included in the mean/variance calculations, but are instead ignored

paste("log(ppmr) mean, weighted by biomass of prey:", bio_herringmean)
## [1] "log(ppmr) mean, weighted by biomass of prey: 6.62917673459157"
paste("Standard deviation of this:", bio_herringSD)
## [1] "Standard deviation of this: 2.3508789268221"
ggplot(data = herringDF, aes(x=lppmr)) + 
          labs(title = "log(ppmr) against biomass density for Herring, weighted by prey biomass", 
               x="log(PPMR)",y="Biomass density of observations") +
          geom_density(aes(weight = prey_weight_g), colour="blue") + 
          theme(plot.title = element_text(size=15)) + 
          stat_function(fun = dnorm, args= with(herringDF, c(mean = bio_herringmean, sd = bio_herringSD))) +
          xlim(-5,17)

#stat_function adds a normal distribution curve (fun=dnorm) with a mean and standard deviation equal to what was calculated earlier for this data set
#xlim sets limits for the x-axis so that all the necessary data points can be seen

The curve with points weighted by the prey biomass is plotted in blue, and a normal curve (also weighted by prey biomasss) is plotted over the top in black.

7.2 Weighted by number of prey

no_herringmean = weighted.mean(herringDF$lppmr, w = herringDF$no._prey_per_stmch, na.rm = TRUE)
no_herringSD = sqrt(wtd.var(herringDF$lppmr, w = herringDF$no._prey_per_stmch, na.rm = TRUE))
#the mean and variance are both weighted by the number of prey

paste("log(ppmr) mean, weighted by number of observations:", no_herringmean)
## [1] "log(ppmr) mean, weighted by number of observations: 13.5410236346048"
paste("Standard deviation of this:", no_herringSD)
## [1] "Standard deviation of this: 2.63330496940788"
ggplot(data = herringDF, aes(x=lppmr), group=1) + 
          labs(title = "log(ppmr) against number density for Herring, weighted by no. of prey",
               x="log(PPMR)", y="No. density of observations") +
          geom_density(aes(weight = no._prey_per_stmch), colour="red") + 
          theme(plot.title = element_text(size=15)) +
          stat_function(fun = dnorm, args= with(herringDF, c(mean = no_herringmean, sd = no_herringSD))) +
          xlim(0,25)

#the xlim is different to the graph above because these data points lie over a slightly different log(ppmr) range

The curve with points weighted by the number of prey per stomach is plotted in red, and a normal curve (also weighted by the number of prey per stomach) is plotted over the top in black.

7.3 Combined graph

ggplot(data = herringDF, aes(x=lppmr), group=1) + 
          labs(title = "log(ppmr) against biomass/number density for Herring,
               with different weightings",
               x="log(PPMR)", y="Number/biomass density of observations") +
          geom_density(aes(weight = no._prey_per_stmch, colour="prey biomass")) + 
          geom_density(aes(weight = prey_weight_g, colour="number of prey per stomach")) + 
          stat_function(fun = dnorm, args= with(herringDF, c(mean = no_herringmean, sd = no_herringSD))) +
          stat_function(fun = dnorm, args= with(herringDF, c(mean = bio_herringmean, sd = bio_herringSD))) +
          theme(plot.title = element_text(size=15)) +
          scale_color_manual(name='Weightings',
                     values=c('number of prey per stomach'='red', 'prey biomass'='blue')) +
        xlim(-5,25) 

#scale_color_manual manually adds a key to the graph to describe what the differently coloured lines represent
#each curve is given a name using 'aes(colour="")', and these are then given a specific colour using 'values='

This is a graph with both the the distribution of Herring log(PPMR) as weighted by prey biomass and number of prey plotted over each other (along with appropriately weighted normal distribution curves plotted over the top of each).

Prey biomass weighted mean: 6.629177 No. of prey weighted mean: 13.54102

Prey biomass standard deviation: 2.3508789268221 No. of prey standard deviation: 2.63330496940788

The mean is shifted by 6.911843.

The mathematics of shifted means for this difference in weighting is:

\[ \text{weighted mean}_{\text{expected prey biomass}} = \text{weighted mean}_{\text{no. of prey }} - (\text{standard deviation}_{\text{no. of prey }})^2 \]

Hence, the expected result is:

\[ \text{weighted mean}_{\text{actual prey biomass}} = 13.54102 - (2.63330496940788)^2 = 6.60672493809 \] There is a difference in expected and actual prey biomass weighted mean of 0.30511806191. This is a fairly insignificant value, which suggests the shifting mean equations are relatively accurate for this data set with these weightings.

Therefore, we can continue with the assumption that there is a difference in the mean of log(ppmr) of the square of the standard deviation when we weight by number of prey per stomach and prey biomass.

8 Introducing a mixed effects model

8.1 Building up the model

8.1.1 Basic Linear model

With $() = m () + c

renamed_df %>% 
    ggplot(aes(x=lprey_weight, y=lppmr, group=pred_species, color=pred_species)) + 
    geom_line(size=1) + theme_classic()

8.1.2 Null

Fixed effects: no fixed effects

Renamed effects: pred_species

null <- lmer(lppmr ~ (1|pred_species), data = renamed_df, REML=FALSE)
#the 'null' model has no predictors (i.e. no linear effects)
#REML=FALSE means the model is made using 'maximum likelihood' (rather than Restricted Maximum Likelihood), which allows us to compare different models

renamed_df %>% 
    mutate(lppmr = fitted(null)) %>% 
    ggplot(aes(x=lprey_weight, y=lppmr, group=pred_species, color=pred_species)) +
    theme_classic() +
    geom_line(size=1) 

Mathematical representation: \[ \log(\text{ppmr})_{ij} = \beta_{0} + S_{0[j]} + \epsilon_{ij} \]

This model has the following variables:

8.1.3 First trial

Fixed effects: lprey_weight

Renamed effects: pred_species

one <- lmer(lppmr ~ lprey_weight + (1|pred_species), data = renamed_df, REML=FALSE)
#same slope for every effect (but varying intercept)

renamed_df %>% 
    mutate(lppmr= fitted(one)) %>% 
    ggplot(aes(x=lprey_weight, y=lppmr, group=pred_species, color=pred_species)) + 
    theme_classic() +
    geom_line(size=1) 

8.1.4 Second trial

Fixed effects: lprey_weight

Renamed effects: pred_species (random slope and fixed intercept; varies w.r.t lprey_weight)

two <- lmer(lppmr ~ lprey_weight + (0+lprey_weight|pred_species), data = renamed_df, REML=FALSE)
#Random intercept and random slope (independent)

renamed_df %>% 
    mutate(lppmr = fitted(two)) %>% 
    ggplot(aes(x=lprey_weight, y=lppmr, group=pred_species, color=pred_species)) + 
    theme_classic() +
    geom_line(size=1) 

8.1.5 Third trial

Fixed effects: lprey_weight

Renamed effects: pred_species (random slope and random intercept; varies w.r.t lprey_weight)

three <- lmer(lppmr ~ lprey_weight + (1+lprey_weight|pred_species), data = renamed_df, REML=FALSE)
#Random intercept and random slope (correlated)

renamed_df %>% 
    mutate(lppmr = fitted(three)) %>% 
    ggplot(aes(x=lprey_weight, y=lppmr, group=pred_species, color=pred_species)) + 
    theme_classic() +
    geom_line(size=1) 

8.1.6 Comparing the model

anova(null,one,two,three)
## Data: renamed_df
## Models:
## null: lppmr ~ (1 | pred_species)
## one: lppmr ~ lprey_weight + (1 | pred_species)
## two: lppmr ~ lprey_weight + (0 + lprey_weight | pred_species)
## three: lppmr ~ lprey_weight + (1 + lprey_weight | pred_species)
##       npar     AIC     BIC  logLik deviance  Chisq Df Pr(>Chisq)    
## null     3 1188058 1188090 -594026  1188052                         
## one      4 1084789 1084831 -542391  1084781 103271  1  < 2.2e-16 ***
## two      4 1047701 1047743 -523847  1047693  37088  0               
## three    6 1024125 1024188 -512057  1024113  23580  2  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#comparing the four models

#(1+lprey_weight|pred_species)
#(1+pred_species|haul_id_short) --> doesn't compile?
#couldn't introduce (1+pred_species|haul_id_short) or (1+lprey_weight|haul_id_short)

8.2 Trialing different models

Here, all the random effects are modelled with a randomly distributed slope and intercept.

8.2.1 Trial a

Fixed effects: lprey_weight

Renamed effects: haul_id_short

a <- lmer(lppmr ~ lprey_weight + (1|haul_id_short), data = renamed_df, REML=FALSE)
a
## Linear mixed model fit by maximum likelihood  ['lmerMod']
## Formula: lppmr ~ lprey_weight + (1 | haul_id_short)
##    Data: renamed_df
##       AIC       BIC    logLik  deviance  df.resid 
##  837098.1  837140.1 -418545.1  837090.1    267427 
## Random effects:
##  Groups        Name        Std.Dev.
##  haul_id_short (Intercept) 1.937   
##  Residual                  1.156   
## Number of obs: 267431, groups:  haul_id_short, 110
## Fixed Effects:
##  (Intercept)  lprey_weight  
##       5.1547       -0.7336
hist(resid(a), border='red', col="transparent", main = paste("Histogram of residuals" ))

#Creates a histogram of the distribution of the residuals
print(VarCorr(a),comp="Variance")
##  Groups        Name        Variance
##  haul_id_short (Intercept) 3.7532  
##  Residual                  1.3358
#Only prints the variance estimates for the random effects
paste("AIC:", AIC(a))
## [1] "AIC: 837098.118608064"
#Prints the AIC value for this model

8.2.2 Trial b

Fixed effects: lprey_weight

Renamed effects: haul_id_short, year

b <- lmer(lppmr ~ lprey_weight + (1|haul_id_short) + (1|year), data = renamed_df, REML=FALSE)
b
## Linear mixed model fit by maximum likelihood  ['lmerMod']
## Formula: lppmr ~ lprey_weight + (1 | haul_id_short) + (1 | year)
##    Data: renamed_df
##       AIC       BIC    logLik  deviance  df.resid 
##  826114.8  826167.2 -413052.4  826104.8    267263 
## Random effects:
##  Groups        Name        Std.Dev.
##  haul_id_short (Intercept) 1.991   
##  year          (Intercept) 1.329   
##  Residual                  1.132   
## Number of obs: 267268, groups:  haul_id_short, 110; year, 100
## Fixed Effects:
##  (Intercept)  lprey_weight  
##       5.1602       -0.7405
hist(resid(b), border='red', col="transparent", main = paste("Histogram of residuals" ))

print(VarCorr(b),comp="Variance")
##  Groups        Name        Variance
##  haul_id_short (Intercept) 3.9630  
##  year          (Intercept) 1.7655  
##  Residual                  1.2824
paste("AIC:", AIC(b))
## [1] "AIC: 826114.763528339"

8.2.3 Trial c

Fixed effects: lprey_weight

Renamed effects: haul_id_short, year, ices_rectangle

c <- lmer(lppmr ~ lprey_weight + (1|haul_id_short) + (1|year) + (1|ices_rectangle), data = renamed_df, REML=FALSE)
c
## Linear mixed model fit by maximum likelihood  ['lmerMod']
## Formula: lppmr ~ lprey_weight + (1 | haul_id_short) + (1 | year) + (1 |  
##     ices_rectangle)
##    Data: renamed_df
##       AIC       BIC    logLik  deviance  df.resid 
##  773370.5  773433.2 -386679.2  773358.5    254589 
## Random effects:
##  Groups         Name        Std.Dev.
##  ices_rectangle (Intercept) 0.4928  
##  year           (Intercept) 1.3417  
##  haul_id_short  (Intercept) 1.7640  
##  Residual                   1.0990  
## Number of obs: 254595, groups:  
## ices_rectangle, 718; year, 97; haul_id_short, 97
## Fixed Effects:
##  (Intercept)  lprey_weight  
##       4.9885       -0.7448
hist(resid(c), border='red', col="transparent", main = paste("Histogram of residuals" ))

print(VarCorr(c),comp="Variance")
##  Groups         Name        Variance
##  ices_rectangle (Intercept) 0.24284 
##  year           (Intercept) 1.80010 
##  haul_id_short  (Intercept) 3.11164 
##  Residual                   1.20774
paste("AIC:", AIC(c))
## [1] "AIC: 773370.488570147"

8.2.4 Trial d

Fixed effects: lprey_weight

Renamed effects: haul_id_short, year, ices_rectangle, pred_species

d <- lmer(lppmr ~ lprey_weight + (1|haul_id_short) + (1|year) + (1|ices_rectangle) + (1|pred_species), data = renamed_df, REML=FALSE)
d
## Linear mixed model fit by maximum likelihood  ['lmerMod']
## Formula: lppmr ~ lprey_weight + (1 | haul_id_short) + (1 | year) + (1 |  
##     ices_rectangle) + (1 | pred_species)
##    Data: renamed_df
##       AIC       BIC    logLik  deviance  df.resid 
##  761133.9  761207.0 -380559.9  761119.9    254588 
## Random effects:
##  Groups         Name        Std.Dev.
##  ices_rectangle (Intercept) 0.4334  
##  year           (Intercept) 1.4254  
##  haul_id_short  (Intercept) 1.6500  
##  pred_species   (Intercept) 0.9175  
##  Residual                   1.0729  
## Number of obs: 254595, groups:  
## ices_rectangle, 718; year, 97; haul_id_short, 97; pred_species, 16
## Fixed Effects:
##  (Intercept)  lprey_weight  
##       4.5200       -0.7583
hist(resid(d), border='red', col="transparent", main = paste("Histogram of residuals" ))

print(VarCorr(d),comp="Variance")
##  Groups         Name        Variance
##  ices_rectangle (Intercept) 0.18783 
##  year           (Intercept) 2.03163 
##  haul_id_short  (Intercept) 2.72256 
##  pred_species   (Intercept) 0.84187 
##  Residual                   1.15111
paste("AIC:", AIC(d))
## [1] "AIC: 761133.897202575"

8.2.5 Trial e

Fixed effects: lprey_weight, pred_species

Renamed effects: haul_id_short, year, ices_rectangle

e <- lmer(lppmr ~ lprey_weight + pred_species + (1|haul_id_short) + (1|year) + (1|ices_rectangle), data = renamed_df, REML=FALSE)
e
## Linear mixed model fit by maximum likelihood  ['lmerMod']
## Formula: lppmr ~ lprey_weight + pred_species + (1 | haul_id_short) + (1 |  
##     year) + (1 | ices_rectangle)
##    Data: renamed_df
##       AIC       BIC    logLik  deviance  df.resid 
##  761051.0  761270.4 -380504.5  761009.0    254574 
## Random effects:
##  Groups         Name        Std.Dev.
##  ices_rectangle (Intercept) 0.4331  
##  year           (Intercept) 1.4235  
##  haul_id_short  (Intercept) 1.6472  
##  Residual                   1.0729  
## Number of obs: 254595, groups:  
## ices_rectangle, 718; year, 97; haul_id_short, 97
## Fixed Effects:
##                (Intercept)                lprey_weight  
##                     3.3042                     -0.7584  
##            pred_speciesCod      pred_speciesCommon Dab  
##                     2.4431                      0.9338  
##  pred_speciesEuropean Hake         pred_speciesHaddock  
##                     1.5254                      1.5661  
##        pred_speciesHerring  pred_speciesHorse Mackerel  
##                     0.5341                      1.9016  
##       pred_speciesMackerel          pred_speciesMegrim  
##                     1.1980                      1.1559  
##       pred_speciesMonkfish     pred_speciesNorway Pout  
##                     2.7481                      0.8318  
##         pred_speciesPlaice        pred_speciesPoor Cod  
##                     1.6408                      0.4747  
##           pred_speciesSole           pred_speciesSprat  
##                     1.6977                     -1.0272  
##        pred_speciesWhiting  
##                     1.8353
hist(resid(e), border='red', col="transparent", main = paste("Histogram of residuals" ))

print(VarCorr(e),comp="Variance")
##  Groups         Name        Variance
##  ices_rectangle (Intercept) 0.18761 
##  year           (Intercept) 2.02628 
##  haul_id_short  (Intercept) 2.71335 
##  Residual                   1.15104
paste("AIC:", AIC(e))
## [1] "AIC: 761051.047629465"

Mathematical representation \[ \log(\text{ppmr})_{ij} = \beta_{0} + \beta_{1} \log(\text{prey weight})_{j} + \beta_{2} (\text{predator species})_{j} + A_{0[j]} + B_{0[j]} + C_{0[j]} + \epsilon_{ij} \]

values <- data.frame(
            Variable=c('i', '$\beta_{0}$', '$\beta_{1}$', '$\beta_{2}$', 
                       '$A_{0[j]}$', '$omega_{00}$', '$B_{0[j]}$', '$C_{0[j]}$',
                       '$epsilon_{ij}$'),
             Value=c(1,2,3,4,5,6,7,8,9))  

values
##         Variable Value
## 1              i     1
## 2    $\beta_{0}$     2
## 3    $\beta_{1}$     3
## 4    $\beta_{2}$     4
## 5     $A_{0[j]}$     5
## 6   $omega_{00}$     6
## 7     $B_{0[j]}$     7
## 8     $C_{0[j]}$     8
## 9 $epsilon_{ij}$     9

This model has the following variables:

Variable name Meaning Value Where to find
\(i\) the group - -
\(j\) the item - -
\(\beta_{0}\) the intercept term 3.304181 fixed effects -> intercept -> estimate
\(\beta_{1}\) the fixed effect (slope) for \(\log(\text{prey weight})\) -0.758358 fixed effects -> lprey_weight -> estimat
\(\beta_{2}\) the fixed effect for \(\text{predator species}\) (?) -
\(A_{0[j]}\) the random intercept for item \(j\) (dependent on the haul_id_short term) - -
\(\omega_{A[00]}\) s.d. for A, haul_id_short 1.6472 random effects -> groups -> ices_rectangle -> std. dev.
\(B_{0[j]}\) random intercept for dep. on the year term - -
\(\omega_{B[00]}\) [s.d. for B, year] 1.4235 -
\(\omega_{C[00]}\) [s.d. for C, ices_rectangle] 0.4331 -
\(C_{0[j]}\) dep. on the ices_rectangle term - -
\(\epsilon_{ij}\) the residual/error term for each individual item 1.0729 random effects -> groups -> residual -> std. dev.

Note:

  • \(A_{0[j]}\) is normally distributed with a mean of \(0\) and a standard deviation \(omega_{00}\) (i.e. \(I_{0[j]} \sim N(0,\omega_{A[00]}^2)\))
  • $_{2} = $ should be individual for each pred species??

8.3 Outputs

Outputs:

  • standard deviation for the random effects and residuals (want low values)
  • no. of observations
  • fixed effect, i.e. output of a linear model
  • frequency plot of the residuals (want clustered around resid=0)

Variance of the groups decreases as other random effects are added to the models. But, the error term (standard error) roughly increases as more random effects are added to the model.

8.4 Adding weighting to the models

#Do we need weighting? Bc it's not a density plot

#trial_df <- data.frame(lppmr = 10:5,                
#                 prey_weight_g = "x",
#                 lprey_weight = letters[1:6],
#                 no._prey_per_stmch = "y")
          
#weight_mixed <- lmer(lppmr ~ lprey_weight, data = herringDF, REML=FALSE)

#weighting observations by prey_weight_g
#two <- update(weight_mixed, weights = prey_weight_g)
#summary(two)

#weighting observations by no._prey_per_stmch
#three <- update(weight_mixed, weights = no._prey_per_stmch)
#summary(three)

#anova(two,three)

9 Citations

#citation()

#devtools::session_info()